package datacrawl.test;

import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.utils.UrlUtils;

public class EasyMoneyPageProcessor  implements PageProcessor {


    private String urlPattern;

    private Site site;

    public EasyMoneyPageProcessor(String startUrl, String urlPattern) {
        this.site = Site.me().addStartUrl(startUrl).
                setDomain(UrlUtils.getDomain(startUrl));
        //compile "*" expression to regex
        this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";

    }

    public void process(Page page) {
        List<String> requests = page.getHtml().links().regex(urlPattern).all();
        //add urls to fetch
        page.addTargetRequests(requests);
        
        page.putField("html", page.getRawText());
        //extract by XPath
//        page.putField("title", page.getHtml().xpath("//title")); 
//        page.putField("html", page.getHtml().toString());
        //extract by Readability
//        page.putField("content", page.getHtml().smartContent());
//        page.putField("每股指标", page.getHtml().xpath("//*[@id='F10MainTargetDiv']/table/tbody/tr[1]/th[1]/span"));
//      
      //*[@id="BBMX_table"]/tbody/tr[1]/th[1]/span
//        page.putField("content", page.getHtml().smartContent());
//        page.putField("每股指标", page.getHtml().xpath("//*[@id='F10MainTargetDiv']/table/tbody/tr[1]/th[1]/span"));
//        page.putField("每股收益", page.getHtml().xpath("//*[@id='F10MainTargetDiv']/table/tbody/tr[2]/td[1]/span"));
      //*[@id="F10MainTargetDiv"]/table/tbody/tr[1]/th[1]/span
    }

    public Site getSite() {
        //settings
        return site;
    }

}
